package com.gourd.common.utils;

import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.LayeredConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContexts;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;

/**
 * JsoupHttpClient工具类
 *
 * @author gourd
 */
@Slf4j
public class JsoupHttpClientUtil {

    private final static String DEFAULT_CHARSET = "utf8";

    /**
     * 根据url获取网页源码
     *
     * @param url 需要爬取的网页URL
     * @return
     */
    public static String getHtmlByUrl(String url) {
        // 获取到的网页源码
        String html = null;
        // 建立一个新的请求客户端
        CloseableHttpClient httpClient = null;
        if (url.startsWith("https://")) {
            httpClient = getHttpsClient();
        } else {
            httpClient = HttpClients.createDefault();
        }

        // 使用HttpGet的方式请求网址
        HttpGet httpGet = new HttpGet(url);
        // 模拟浏览器
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:50.0) Gecko/20100101 Firefox/50.0");

        // 使用代理 IP
//        HttpHost proxy = new HttpHost("118.114.77.47", 8080);
        RequestConfig config = RequestConfig.custom()
                .setConnectionRequestTimeout(10000)
                // 设置连接超时时间 10秒钟
                .setConnectTimeout(10000)
                // 设置读取超时时间10秒钟
                .setSocketTimeout(10000)
                .build();
        httpGet.setConfig(config);
        // 获取网址的返回结果
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response != null && response.getStatusLine().getStatusCode() == 200) {
                //获取返回结果中的实体
                HttpEntity entity = response.getEntity();
                //将返回的实体输出
                html = EntityUtils.toString(entity, DEFAULT_CHARSET);
            } else {
                log.error("获取网页源码失败");
            }
        } catch (IOException e) {
            log.error("获取网页源码异常：", e);
        } finally {
            // 关闭资源
            try {
                if (response != null) {
                    response.close();
                }
            } catch (IOException e) {
                log.error("response关闭错误，请检查原因");
            }
            try {
                if(httpClient != null){
                    httpClient.close();
                }
            } catch (IOException e) {
                log.error("httpClient关闭错误，请检查原因");
            }
        }
        return html;
    }

    /**
     * 根据html获取元素
     *
     * @param html    网页源码
     * @param tagName 标签名
     * @return
     */
    public static Elements getElements(String html, String tagName) {
        // 解析网页 得到文档对象
        Document doc = Jsoup.parse(html);
        // 获取tag是tagName的所有DOM元素，数组
        Elements elements = doc.getElementsByTag(tagName);
        return elements;
    }


    /**
     * 获取https连接（不验证证书）
     *
     * @return
     */
    private static CloseableHttpClient getHttpsClient() {
        RegistryBuilder<ConnectionSocketFactory> registryBuilder = RegistryBuilder.<ConnectionSocketFactory>create();
        ConnectionSocketFactory plainSF = new PlainConnectionSocketFactory();
        registryBuilder.register("http", plainSF);
        // 指定信任密钥存储对象和连接套接字工厂
        try {
            KeyStore trustStore = KeyStore.getInstance(KeyStore.getDefaultType());
            // 信任任何链接
            TrustStrategy anyTrustStrategy = new TrustStrategy() {

                @Override
                public boolean isTrusted(java.security.cert.X509Certificate[] arg0, String arg1) throws java.security.cert.CertificateException {
                    return true;
                }
            };
            SSLContext sslContext = SSLContexts.custom().useTLS().loadTrustMaterial(trustStore, anyTrustStrategy).build();
            LayeredConnectionSocketFactory sslSF = new SSLConnectionSocketFactory(sslContext, SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
            registryBuilder.register("https", sslSF);
        } catch (KeyStoreException e) {
            throw new RuntimeException(e);
        } catch (KeyManagementException e) {
            throw new RuntimeException(e);
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        }
        Registry<ConnectionSocketFactory> registry = registryBuilder.build();
        // 设置连接管理器
        PoolingHttpClientConnectionManager connManager = new PoolingHttpClientConnectionManager(registry);
        // 构建客户端
        return HttpClientBuilder.create().setConnectionManager(connManager).build();
    }


}
